{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3386\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n",
      "/home/husein/dev/malaya/malaya/tokenizer.py:208: FutureWarning: Possible nested set at position 3904\n",
      "  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import malaya\n",
    "from tqdm import tqdm\n",
    "from unidecode import unidecode\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    splitted = malaya.text.function.split_into_sentences(string)\n",
    "    if not len(splitted):\n",
    "        splitted = '. '.join([k.strip() for k in string.split('.') if len(k.strip())])\n",
    "    if splitted[0][0] == '-':\n",
    "        splitted[0] = splitted[0].replace('- ','')\n",
    "    points = [f'{no + 1}. {s}' for no, s in enumerate(splitted)]\n",
    "    points = ' '.join(points)\n",
    "    return points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from malaya.text.vectorizer import SkipGramCountVectorizer, SkipGramTfidfVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation\n",
    "\n",
    "stopwords = malaya.text.function.get_stopwords()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = SkipGramCountVectorizer(\n",
    "    max_df = 0.95,\n",
    "    min_df = 1,\n",
    "    ngram_range = (1, 3),\n",
    "    stop_words = stopwords,\n",
    "    skip = 2\n",
    ")\n",
    "svd = TruncatedSVD(n_components = 30)\n",
    "model = malaya.summarization.extractive.sklearn(svd, vectorizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = SkipGramTfidfVectorizer(\n",
    "    max_df = 0.95,\n",
    "    min_df = 1,\n",
    "    ngram_range = (1, 3),\n",
    "    stop_words = stopwords,\n",
    "    skip = 2\n",
    ")\n",
    "svd = TruncatedSVD(n_components = 30)\n",
    "model_tfidf = malaya.summarization.extractive.sklearn(svd, vectorizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "months = {\n",
    "    'january',\n",
    "    'jan',\n",
    "    'januari',\n",
    "    'february',\n",
    "    'feb',\n",
    "    'februari',\n",
    "    'march',\n",
    "    'mac',\n",
    "    'april',\n",
    "    'apr',\n",
    "    'may',\n",
    "    'mei',\n",
    "    'june',\n",
    "    'jun',\n",
    "    'july',\n",
    "    'julai',\n",
    "    'august',\n",
    "    'ogos',\n",
    "    'aug',\n",
    "    'september',\n",
    "    'sep',\n",
    "    'october',\n",
    "    'oktober',\n",
    "    'oct',\n",
    "    'november',\n",
    "    'nov',\n",
    "    'december',\n",
    "    'disember',\n",
    "    'dec',\n",
    "    'utusan',\n",
    "    'malaysiakini',\n",
    "    'astroawani',\n",
    "    'bernama',\n",
    "    'com',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def simple_cleaning(string):\n",
    "    return re.sub(r'[ ]+', ' ', unidecode(string).replace('\\n', ' ').replace('--', ' ').replace('/', ' ')).strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['berita-bisnes.json.nested.semisupervised',\n",
       " 'berita-dunia.json.nested.semisupervised',\n",
       " 'berita-hiburan.json.nested.semisupervised',\n",
       " 'berita-malaysia.json.nested.semisupervised',\n",
       " 'berita-politik.json.nested.semisupervised',\n",
       " 'berita-sukan.json.nested.semisupervised',\n",
       " 'berita-teknologi.json.nested.semisupervised',\n",
       " 'gaya-hidup.json.nested.semisupervised']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "files = sorted(glob('*.nested.semisupervised'))\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(files[0]) as fopen:\n",
    "    for l in fopen:\n",
    "        data = json.loads(l)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "9622it [01:11, 134.35it/s]\n",
      "7897it [00:42, 184.82it/s]\n",
      "9956it [01:13, 135.53it/s]\n",
      "8578it [00:59, 143.62it/s]\n",
      "9970it [01:11, 139.64it/s]\n",
      "9826it [01:06, 148.31it/s]\n",
      "972it [00:06, 139.42it/s]\n",
      "5342it [00:46, 114.30it/s]\n"
     ]
    }
   ],
   "source": [
    "before, after = [], []\n",
    "\n",
    "count = 0\n",
    "rejected = []\n",
    "languages = []\n",
    "accepted = []\n",
    "para = []\n",
    "malaysian_news = {\n",
    "    'kosmo',\n",
    "    'hmetro',\n",
    "    'malaymail',\n",
    "    'projekmm',\n",
    "    'bharian',\n",
    "    'utusan',\n",
    "    'astroawani',\n",
    "    'themalaysianinsight',\n",
    "    'malaysiakini',\n",
    "    'bernama'\n",
    "}\n",
    "\n",
    "def reject(data):\n",
    "    if data['news'] in malaysian_news:\n",
    "        return False\n",
    "    if any([n in data['top-image'] for n in malaysian_news]):\n",
    "        return False\n",
    "    if any([n in data['url'] for n in malaysian_news]):\n",
    "        return False\n",
    "    if 'com.my' in data['top-image']:\n",
    "        return False\n",
    "    if data['language'] == 'malay':\n",
    "        return False\n",
    "    if 'Siaran Pers' in data['news']:\n",
    "        return True\n",
    "    if '.id' in data['news']:\n",
    "        return True\n",
    "    \n",
    "    return True\n",
    "\n",
    "for f in files:\n",
    "    with open(f) as fopen:\n",
    "        for l in tqdm(fopen):\n",
    "            data = json.loads(l)\n",
    "            soup = BeautifulSoup(data['r']['response']['articleBody'], \"lxml\")\n",
    "            text = BeautifulSoup(soup.text, 'lxml').text\n",
    "            text = re.sub(r'[ ]+', ' ', text).strip()\n",
    "            if 'kindly register' in text.lower() or 'disabled in your browser' in text.lower():\n",
    "                continue\n",
    "            if len(text.split()) < 30:\n",
    "                continue\n",
    "\n",
    "            # accepted.append(data)\n",
    "            summaries = [malaya.text.function.remove_empty_parenthesis(s) for s in data['semisupervised-summaries']]\n",
    "            for s in summaries:\n",
    "                if len(s.split()) > 20:\n",
    "                    accepted.append((s, text))\n",
    "\n",
    "            keywords_rake = malaya.keyword.extractive.rake(text, \n",
    "                                                      top_k = random.randint(25, 50))\n",
    "            keywords_rake = [simple_cleaning(k[1]) for k in keywords_rake if len(k[1].split()) >= 3 and len(k[1]) > 20 \\\n",
    "                            and len(set(k[1].lower().replace('-', '').split()) & months) == 0]\n",
    "\n",
    "            already = set()\n",
    "            filtered = []\n",
    "            for k in keywords_rake:\n",
    "                if k.lower() in already:\n",
    "                    continue\n",
    "                else:\n",
    "                    already.add(k.lower())\n",
    "                    filtered.append(k)\n",
    "\n",
    "            keywords_rake = filtered\n",
    "\n",
    "            if len(keywords_rake) > 5:\n",
    "                # print(keywords_rake)\n",
    "                accepted.append(('. '.join(keywords_rake), text))\n",
    "\n",
    "    #         if random.random() > 0.1:\n",
    "    #             continue\n",
    "\n",
    "    #         try:\n",
    "    #             extractive = model_tfidf.sentence_level(data['text'], top_k = random.randint(3, 5))['summary']\n",
    "    #             splitted = malaya.text.function.split_into_sentences(data['text'])\n",
    "    #             if len(malaya.text.function.split_into_sentences(extractive)) <= (len(splitted) / 2):\n",
    "    #                 accepted.append((extractive, data['text']))\n",
    "    #         except Exception as e:\n",
    "    #             print(e)\n",
    "\n",
    "            count += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('news-astroawani.json', 'w') as fopen:\n",
    "    json.dump(accepted, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "178407"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(accepted)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
